#no need to refresh kernel when changes are made to the helper scripts
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import os
import sys
from IPython.display import display,FileLink, Markdown, HTML, Image
cwd = os.path.dirname(os.getcwd()) #add the cwd so that python scripts can be imported.
if cwd not in sys.path:
sys.path.insert(0, cwd)
#to save in the same directory as the notebook, change to resource_path="".
# print(project_root)
# print(resource_path)
#parameters
gse = "GSE247175"
project_root = cwd
# Parameters
gse = "GSE247175"
project_root = "/home/ajy20/geo2reports"
from pathlib import Path
resource_path = os.path.join(project_root, "output", gse)
Path(resource_path).mkdir(parents=True, exist_ok=True)
import contextlib
@contextlib.contextmanager
def suppress_output(stdout=True, stderr=True, dest='/dev/null'):
''' Usage:
with suppress_output():
print('hi')
'''
dev_null = open(dest, 'a')
if stdout:
_stdout = sys.stdout
sys.stdout = dev_null
if stderr:
_stderr = sys.stderr
sys.stderr = dev_null
try:
yield
finally:
if stdout:
sys.stdout = _stdout
if stderr:
sys.stderr = _stderr
from Bio import Entrez
from dotenv import load_dotenv
load_dotenv()
os.environ['ENTREZ_EMAIL'] = os.getenv('ENTREZ_EMAIL')
Entrez.email = os.environ['ENTREZ_EMAIL']
id_handle = Entrez.esearch(db="gds", term=f"{gse}[Accession]", retmax=1)
id_record = Entrez.read(id_handle)
gds_id = id_record["IdList"][0]
stream = Entrez.esummary(db='gds', id=gds_id)
record = Entrez.read(stream)
map_species = {
"homo sapiens": "human",
"mus musculus": "mouse"
}
species = map_species[record[0]['taxon'].lower()]
if len(record[0]['PubMedIds'])==0: #discard studies with no pubmed citation
raise ValueError("No PubMed citation found for this study.")
pmid = int(record[0]['PubMedIds'][0])
if len(record[0]['Samples']) not in range(6, 25): #discard studies that dont have 6-24 samples
raise ValueError("Number of samples need to be within 6-24.")
def fetch_pubmed_metadata(pmid):
handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
records = Entrez.read(handle)
article = records['PubmedArticle'][0]['MedlineCitation']['Article']
title = article['ArticleTitle']
journal = article['Journal']['Title']
journal_abbr = article['Journal']['ISOAbbreviation']
year = article['Journal']['JournalIssue']['PubDate'].get('Year', '')
volume = article['Journal']['JournalIssue'].get('Volume', '')
issue = article['Journal']['JournalIssue'].get('Issue', '')
pages = article.get('Pagination', {}).get('MedlinePgn', '')
authors = article.get('AuthorList', [])
# def format_author(author):
# initials = ''.join(author.get('Initials', ''))
# return f"{author['LastName']} {initials}"
def format_author(author):
initials = '. '.join(author.get('Initials', '')) + '.'
return f"{author['LastName']}, {initials}"
authors = [format_author(a) for a in authors]
return {
"title": title,
"journal": journal_abbr,
"year": year,
"volume": volume,
"issue": issue,
"pages": pages,
"authors": authors
}
pmdict = fetch_pubmed_metadata(pmid)
def format_apa_citation(metadata, pmid=None):
authors = metadata['authors']
if len(authors) <= 20:
if len(authors) > 1:
author_str = ', '.join(authors[:-1]) + ', & ' + authors[-1]
else:
author_str = authors[0]
else:
author_str = ', '.join(authors[:19]) + ', ... ' + authors[-1]
citation = (
f"{author_str} ({metadata['year']}). {metadata['title']}. "
f"*{metadata['journal']}*, {metadata['volume']}({metadata['issue']}), {metadata['pages']}."
)
if metadata.get('doi'):
citation += f" https://doi.org/{metadata['doi']}"
elif pmid:
citation += f" PMID: {pmid}"
return citation
citation = format_apa_citation(pmdict)
display(Markdown(f"# **Reanalysis of \"{pmdict['title']}\" by {pmdict['authors'][0]} et al., {pmdict['journal']}, {pmdict['year']}**"))
Reanalysis of "Hexamethylene amiloride synergizes with venetoclax to induce lysosome-dependent cell death in acute myeloid leukemia." by Jiang, X. et al., iScience, 2024¶
link = f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={gse}"
display(Markdown(f"{citation}"))
HTML(f'<a href="{link}" target="_blank">Visit GEO accession page</a>')
Jiang, X., Huang, K., Sun, X., Li, Y., Hua, L., Liu, F., Huang, R., Du, J., & Zeng, H. (2024). Hexamethylene amiloride synergizes with venetoclax to induce lysosome-dependent cell death in acute myeloid leukemia.. iScience, 27(1), 108691.
from google import genai
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
#print(os.environ["GOOGLE_API_KEY"])
prompt = f'''
You are an expert academic writer. Your task is to reformat the provided research information into a concise abstract around 250 words following this exact template:
"In this study, <FIRST AUTHOR> et al. [1] profiled <CELLS AND CONDITIONS> to further our understanding of <TOPIC>. The reanalysis of this dataset include <FILL IN>"
Here is the contextual information:
Author: {pmdict['authors']}
Title: {pmdict['title']}
Summary: {record[0]['summary']}
In the reanalysis explanation, use the following information: the reanalysis is a full RNA-seq analysis pipeline that consists of: UMAP[2], PCA[3], t-SNE[4] plots of the samples; clustergram heatmap; differential gene expression analysis
for each pair of control and perturbation samples; Enrichment analysis for each gene signature using Enrichr [5, 6, 7]; Transcription factor analysis of gene signatures
using ChEA3 [8] ; Reverser and mimicker drug match analysis using L2S2 [9] and DRUG-seqr [10], both FDA and non-FDA approved. Results are provided as tables in addition to bar charts.
Please write the reanalysis as a complete paragraph with smoothly transitioning sentences. Use consistent, present tense.
Do not omit or change the ordering of the reference numbers.
Do not change the reference and insert it, in parentheses, where indicated.
Now, generate the abstract strictly following the template. Do not include any other text or introductory/concluding remarks.
'''
response = client.models.generate_content(
model="gemini-2.0-flash", contents=prompt
)
#print(response.text)
Abstract¶
display(Markdown(f"{response.text}\n*This abstract was generated with the assistance of Gemini 2.0 Flash.*"))
In this study, Jiang et al. [1] profiled acute myeloid leukemia (AML) cell lines under treatment with Hexamethylene amiloride (HA) and venetoclax, alone and in combination, to further our understanding of the therapeutic potential of targeting NHE1 in AML. The reanalysis of this dataset includes a comprehensive RNA-seq analysis pipeline consisting of UMAP (Uniform Manifold Approximation and Projection) [2], PCA (Principal Component Analysis) [3], and t-SNE (t-distributed Stochastic Neighbor Embedding) [4] plots for visualizing sample distributions. A clustergram heatmap provides an overview of gene expression patterns across samples. Differential gene expression analysis is performed for each control and perturbation sample pair. Enrichment analysis for each resulting gene signature is conducted using Enrichr [5, 6, 7]. Transcription factor analysis of these gene signatures is performed utilizing ChEA3 [8]. Furthermore, the reanalysis incorporates reverser and mimicker drug match analysis using L2S2 [9] and DRUG-seqr [10], considering both FDA-approved and non-FDA-approved compounds. Results are presented as tables and bar charts.
This abstract was generated with the assistance of Gemini 2.0 Flash.
Methods¶
RNA-seq alignment
Gene count matrices were obtained from ARCHS4 [11], which preprocessed the raw FASTQ data using the Kallisto [12] and STAR [] pseudoalignment algorithm.
Gene matrix processing
The raw gene matrix was filtered to remove genes that do not have an average of 3 reads across the samples. It was then quantile, log2, and z-score normalized. A regex-based function was used to infer whether individual samples belong to a “control” or a “perturbation” group by processing the metadata associated with each sample.
Dimensionality Reduction Visualization
Three types of dimensionality reduction techniques were applied on the processed expression matrices: UMAP[2], PCA[3], and t-SNE[4]. UMAP was calculated by the UMAP Python package and PCA and t-SNE were calculated using the Scikit-Learn Python library. The samples were then represented on 2D scatterplots.
Clustergram Heatmap
As a preliminary step, the top 1000 genes exhibiting most variability were selected. Using this new set, clustergram heatmaps were generated. Two versions of the clustergram exist: an interactive one generated by Clustergrammer [13] and a publication-ready alternative.
Differentially Expressed Genes Calculation and Volcano Plot
Differentially expressed genes between the control and perturbation samples were calculated using Limma Voom [14]. The logFC and -log10p values of each gene were visualized as a volcano scatterplot. Upregulated and downregulated genes were selected according to this criteria: p < 0.05 and |logFC| > 1.0.
Enrichr Enrichment Analysis
The upregulated and down-regulated sets were separately submitted to Enrichr [5, 6, 7]. These sets were compared against libraries from ChEA [8], ARCHS4 [12], Reactome Pathways [15], MGI Mammalian Phenotype [16], Gene Ontology Biological Processes [17], GWAS Catalog [18], KEGG [19, 20, 21], and WikiPathways [22]. The top matched terms from each library and their respective -log10p values were visualized as barplots.
Chea3 Transcription Factor Analysis
The upregulated and down-regulated sets were separately submitted to Chea3 [8]. These sets were compared against the libraries ARCHS4 Coexpression [12], GTEx Coexpression [23], Enrichr [5, 6, 7], ENCODE ChIP-seq [24, 25], ReMap ChIP-seq [26], and Literature-mined ChIP-seq. The top matched TFs were ranked according to their average score across each library and represented as barplots.
L2S2 and Drug-seqr drug analysis
The top 500 up and downregulated sets were submitted simulataneously to identify reverser and mimicker molecules, both FDA and non-FDA approved, from the L2S2 [9] and Drug-seqr [10] databases. The top matched molecules were compiled into tables and visualized as barplots.
%%capture
import json
from datetime import datetime
#write a json file as a catalog list.
metadata_path = Path(os.path.join(project_root, "public", "metadata.json"))
# 1. Load existing metadata (or create empty if file doesn't exist)
if metadata_path.exists():
with open(metadata_path, "r") as f:
metadata = json.load(f)
else:
metadata = {}
entry = {
"GSE": gse,
"author": ", ".join(pmdict['authors']),
"year": pmdict['year'],
"species": species,
"title": pmdict['title'],
"pmid": pmid,
"num_samps": len(record[0]['Samples']),
"samples": ", ".join(sorted([w['Accession'] for w in record[0]['Samples']])),
"citation": citation,
"notebook_path": f"{resource_path}/{gse}.ipynb",
#"report_path": f"{resource_path}/{gse}.html",
"timestamp": datetime.now().isoformat()
}
# 3. Add entry only if it doesn't already exist
if gse not in metadata:
metadata[gse] = entry
print(f"[INFO] Added metadata for {gse}")
else:
print(f"[INFO] {gse} already exists in metadata. Skipping update.")
# 4. Write updated metadata back to file
with open(metadata_path, "w") as f:
json.dump(metadata, f, indent=4)
tab_num = 1
fig_num = 1
save_formats = ['png', 'svg', 'jpeg']
import archs4py as a4
#file_path = a4.download.counts("human", path="", version="latest") #comment out if the file is already downloaded
file = os.path.join("/home/ajy20/projects/8--auto-playbook-geo-reports", "human_gene_v2.latest.h5")
metadata = a4.meta.series(file, gse)
%%capture
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
words_to_remove = ['experiement', 'tissue', 'type', 'batch', 'treatment', 'experiment', 'patient', 'batch', '1', '2', '3', '4', '5', '6', '7', '8', '9']
stopwords_plus = set(stopwords.words('english') + (words_to_remove))
pattern = r'[-,_.:]'
terms_to_remove = ["cell line", "cell type", "genotype", "treatment"]
pattern1 = r"\b(" + "|".join(map(re.escape, terms_to_remove)) + r")\b"
metadata["cleaned_characteristics"] = metadata["characteristics_ch1"].str.replace(
pattern1,
"",
flags=re.IGNORECASE,
regex=True
).str.replace(r"\s+", " ", regex=True).str.strip()
metadata['cleaned_characteristics'] = metadata['cleaned_characteristics'].apply(lambda x: re.sub(pattern, " ", x).strip().lower())
metadata['cleaned_characteristics'] = metadata['cleaned_characteristics'].apply(
lambda text: " ".join([word for word in text.split() if word not in stopwords_plus])
)
metadata['clean_title'] = metadata['title'].apply(lambda x: re.sub('[0-9]+', '', x))
metadata['clean_title'] = metadata['clean_title'].apply(lambda x: re.sub(pattern, " ", x).strip().lower())
metadata['clean_title'] = metadata['clean_title'].apply(
lambda text: " ".join([word for word in text.split() if word not in stopwords_plus])
)
#metadata
groups = metadata.groupby(by='clean_title', level=None)
ctrl_words = set(['wt', 'wildtype', 'control', 'cntrl', 'ctrl', 'uninfected', 'normal', 'untreated', 'unstimulated', 'shctrl', 'ctl', 'healthy', 'sictrl', 'sicontrol', 'ctr', 'wild', 'dmso', 'vehicle', 'naive'])
groupings = {}
for label, group in groups:
if len(group) not in {3, 4}: #enforce 3-4 samples per group
raise ValueError("Study does not have 3-4 samples per group")
groupings[label] = group['geo_accession'].tolist()
# print(groupings)
title_conditions = list(groupings.keys())
title_ctrl = []
for c in title_conditions:
if len(set(c.split()).intersection(ctrl_words)) > 0:
title_ctrl.append(c)
# print(title_conditions)
# print(title_ctrl)
og_labels = {}
labled_groupings = {}
for label in groupings:
samps = groupings[label]
data = list(map(lambda s: s.lower(), metadata.loc[samps]['characteristics_ch1'].values))
data_clean = []
for d in data:
data_clean.append(set(filter(lambda w: w not in stopwords_plus, re.sub(pattern, ' ', d).split())))
condition = set(data_clean[0])
for s in data_clean[1:]:
condition.intersection_update(s)
condition = ' '.join(list(condition))
labled_groupings[condition] = samps
og_labels[condition] = label
ch1_ctrl = []
ch1_conditions = list(labled_groupings.keys())
for condition in labled_groupings:
split_conditions = condition.lower().split()
if len(set(split_conditions).intersection(ctrl_words)) > 0:
ch1_ctrl.append(condition)
# print(og_labels)
# print(ch1_conditions)
# print(ch1_ctrl)
#must have 1-2 controls. Must have perturbation groups as well (not all groups can be controls).
def check_eligibility(conditions, ctrl_conditions):
if len(ctrl_conditions) not in range(1, 3) or len(ctrl_conditions) == len(conditions):
return False
else:
return True
ch1_eligibility = check_eligibility(ch1_conditions, ch1_ctrl)
title_eligibility = check_eligibility(title_conditions, title_ctrl)
#print(ch1_eligibility)
#rint(title_eligibility)
def compare_groups(title_ctrl, ch1_ctrl, og_labels):
#convert ch1 condition to corresponding title condition, check if their respective sample sets are equal
for c in ch1_ctrl:
ch1_corresponding = og_labels[c]
if set(groupings[ch1_corresponding]) != set(labled_groupings[c]):
return False
return True
if ch1_eligibility and title_eligibility:
if compare_groups(title_ctrl, ch1_ctrl, og_labels):
ctrl_conditions = title_ctrl
conditions = title_conditions
else:
raise Exception("Group Assignment Failed")
elif ch1_eligibility ^ title_eligibility:
if ch1_eligibility:
ctrl_conditions = ch1_ctrl
conditions = ch1_conditions
groupings = labled_groupings
else:
ctrl_conditions = title_ctrl
conditions = title_conditions
else:
raise Exception("Group Assignment Failed")
# print(ctrl_conditions)
# print(conditions)
%%capture
gene_matrix = a4.data.series(file, gse) #raw counts
gene_matrix.to_csv(os.path.join(resource_path, "raw_counts.csv"))
gene_matrix.head(5)
| GSM7884755 | GSM7884756 | GSM7884757 | GSM7884758 | GSM7884759 | GSM7884760 | GSM7884761 | GSM7884762 | GSM7884763 | GSM7884764 | GSM7884765 | GSM7884766 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| TSPAN6 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 2 |
| TNMD | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| DPM1 | 1475 | 1322 | 1197 | 1580 | 1236 | 1349 | 1090 | 899 | 1048 | 1126 | 1334 | 1148 |
| SCYL3 | 417 | 291 | 281 | 357 | 288 | 300 | 385 | 296 | 349 | 264 | 241 | 260 |
| C1ORF112 | 368 | 304 | 305 | 524 | 310 | 390 | 374 | 255 | 270 | 142 | 176 | 128 |
display(Markdown(f"**table {tab_num}**: This is a preview of the first 5 rows of the raw RNA-seq expression matrix from {gse}."))
tab_num +=1
display(FileLink(os.path.join(resource_path, "raw_counts.csv"), result_html_prefix="Download raw counts: "))
table 1: This is a preview of the first 5 rows of the raw RNA-seq expression matrix from GSE247175.
# Remove genes with all-zero counts
filtered_matrix = gene_matrix.loc[gene_matrix.sum(axis=1) > 0, :]
# Then filter out low average expression
filtered_matrix = filtered_matrix.loc[filtered_matrix.mean(axis=1) >= 3, :]
from maayanlab_bioinformatics.normalization.log import log2_normalize
from maayanlab_bioinformatics.normalization.zscore import zscore_normalize
from maayanlab_bioinformatics.normalization.quantile_legacy import quantile_normalize
def normalize(gene_counts):
norm_exp = quantile_normalize(gene_counts)
norm_exp = log2_normalize(norm_exp)
norm_exp = zscore_normalize(norm_exp)
return norm_exp
%%capture
# from python_scripts.matrix import normalize
norm_matrix = normalize(filtered_matrix) #normalize the matrix for dim reduction and clustergram
def annotate_matrix(expr_df, groupings, ctrl_conditions):
sampdict = {}
for group in groupings.keys():
samps = groupings[group]
for samp in samps:
if group in ctrl_conditions:
sampdict[samp] = "control"
else:
sampdict[samp] = "perturbation"
annotat = pd.DataFrame.from_dict(sampdict, orient='index', columns=['group'])
anndict = {
'count': expr_df,
'annotations': annotat
}
return anndict
annotated_norm_matrix = annotate_matrix(norm_matrix, groupings, ctrl_conditions)
#print(annotated_norm_matrix['annotations'])
annotated_matrix = annotate_matrix(filtered_matrix.astype('int64'), groupings, ctrl_conditions) #filtered but not normalized
Results¶
save_html = True #if true, render plotly graphs as html and embed with ipython. else, use fig.show()
use_fig_plot = False #if true, render matplotlib graphs using show(), else it will render the saved pngs.
Dimensionality Reduction¶
UMAP¶
import python_scripts.visualizations as vis
vis.plot(annotated_norm_matrix['count'], annotated_norm_matrix['annotations'], n_components=2, save_formats=save_formats, decomp="umap", save_html=save_html, save_path=resource_path)
#if save_html: display(HTML(os.path.join(resource_path, "umap.html")))
display(Image(os.path.join(resource_path, "umap.png"), width=700))
display(Markdown(f"**Figure {fig_num}**: This figure displays a 2D scatter plot of a UMAP decomposition of the sample data. Each point represents an individual sample, colored by its experimental group."))
fig_num+=1
for fmt in save_formats:
display(FileLink(os.path.join(resource_path, f"umap.{fmt}"), result_html_prefix=f"Download UMAP figure as {fmt}: "))
Figure 1: This figure displays a 2D scatter plot of a UMAP decomposition of the sample data. Each point represents an individual sample, colored by its experimental group.
PCA¶
vis.plot(annotated_norm_matrix['count'], annotated_norm_matrix['annotations'], n_components=2, save_formats=save_formats, decomp="pca", save_html=save_html, save_path=resource_path)
#if save_html: display(HTML(os.path.join(resource_path, "pca.html")))
display(Image(os.path.join(resource_path, "pca.png"), width=700))
display(Markdown(f"**Figure {fig_num}**: This figure displays a 2D scatter plot of a PCA decomposition of the sample data. Each point represents an individual sample, colored by its experimental group."))
fig_num+=1
for fmt in save_formats:
display(FileLink(os.path.join(resource_path, f"pca.{fmt}"), result_html_prefix=f"Download PCA figure as {fmt}: "))
Figure 2: This figure displays a 2D scatter plot of a PCA decomposition of the sample data. Each point represents an individual sample, colored by its experimental group.
t-SNE¶
vis.plot(annotated_norm_matrix['count'], annotated_norm_matrix['annotations'], n_components=2, save_formats=save_formats, decomp="tsne", save_html=save_html, save_path=resource_path)
#if save_html: display(HTML(os.path.join(resource_path, "tsne.html")))
display(Image(os.path.join(resource_path, "tsne.png"), width=700))
display(Markdown(f"**Figure {fig_num}**: This figure displays a 2D scatter plot using a t-SNE decomposition of the sample data. Each point represents an individual sample, colored by its experimental group."))
fig_num+=1
for fmt in save_formats:
display(FileLink(os.path.join(resource_path, f"tsne.{fmt}"), result_html_prefix=f"Download t-SNE figure as {fmt}: "))
Figure 3: This figure displays a 2D scatter plot using a t-SNE decomposition of the sample data. Each point represents an individual sample, colored by its experimental group.
Clustergram Heatmaps¶
from maayanlab_bioinformatics.normalization.filter import filter_by_var
norm_t1000 = annotated_norm_matrix['count'].copy()
norm_t1000 = filter_by_var(annotated_norm_matrix['count'], top_n=1000, axis=1)
norm_t1000.columns=metadata['title'].tolist()
t1000_path = os.path.join(resource_path, 'expression_matrix_top1000_genes.txt')
norm_t1000.to_csv(t1000_path, sep='\t')
import requests, json
clustergrammer_url = 'http://amp.pharm.mssm.edu/clustergrammer/matrix_upload/'
r = requests.post(clustergrammer_url, files={'file': open(t1000_path, 'rb')})
link = r.text
from IPython.display import IFrame
display(IFrame(link, width="600", height="650"))
display(Markdown(f"**Figure {fig_num}**: The figure contains an interactive heatmap displaying gene expression for each sample in the RNA-seq dataset. Every row of the heatmap represents a gene, every column represents a sample, and every cell displays normalized gene expression values. The heatmap additionally features color bars beside each column which represent prior knowledge of each sample, such as the tissue of origin or experimental treatment."))
fig_num+=1
Figure 4: The figure contains an interactive heatmap displaying gene expression for each sample in the RNA-seq dataset. Every row of the heatmap represents a gene, every column represents a sample, and every cell displays normalized gene expression values. The heatmap additionally features color bars beside each column which represent prior knowledge of each sample, such as the tissue of origin or experimental treatment.
from python_scripts import visualizations as vis
clustergram = vis.plot_clustergram(norm_t1000)
for fmt in save_formats:
file_name = os.path.join(resource_path, f"clustergram.{fmt}")
clustergram.write_image(file_name, width=700, height=700)
# if save_html:
# cluster_path = os.path.join(resource_path, "clustergram.html")
# clustergram.write_html(cluster_path)
# display(HTML(cluster_path))
# else:
# clustergram.show()
display(Image(os.path.join(resource_path, "clustergram.png"), width=700))
display(Markdown(f"**Figure {fig_num}**: this figure is a clustergram produced with the graphing library Plotly. It sacrifices some interactivity for a more polished look."))
fig_num+=1
for fmt in save_formats:
file_name = os.path.join(resource_path, f"clustergram.{fmt}")
#clustergram.write_image(file_name, width=600, height=600)
display(FileLink(file_name, result_html_prefix=f"Download as {fmt}: "))
Figure 5: this figure is a clustergram produced with the graphing library Plotly. It sacrifices some interactivity for a more polished look.
Differentially Expressed Genes Calculation and Volcano Plots¶
from maayanlab_bioinformatics.dge.limma_voom import limma_voom_differential_expression
sig_names = []
matrix = annotated_matrix['count']
dges = {}
seen = []
for condition in ctrl_conditions:
for condition2 in conditions:
if condition!=condition2 and {condition, condition2} not in seen:
seen.append({condition, condition2})
sig_name = f'{condition}-vs-{condition2}.tsv'
sig_names.append(sig_name)
try:
with suppress_output():
dge = limma_voom_differential_expression(
matrix[groupings[condition]],
matrix[groupings[condition2]],
voom_design=True,
)
if not dge.empty:
dge['logFC'] = dge['logFC'].round(2)
dge['AveExpr'] = dge['AveExpr'].round(2)
dge['t'] = dge['t'].round(2)
dge['B'] = dge['B'].round(2)
dges[sig_name] = dge
dge.to_csv(os.path.join(resource_path, sig_name), sep='\t')
else:
print('Empty dge returned for', sig_name)
except Exception as e:
print(e)
print('Error computing:', sig_name)
for sig_name in sig_names:
dge_path = os.path.join(resource_path, sig_name)
# table = pd.read_csv(dge_path, sep="\t")
table = dges[sig_name]
display(table.head(5))
display(Markdown(f"**Table {tab_num}**: This is a preview of the first 5 rows of the differentially expressed gene table calculated by Limma Voom."))
display(FileLink(dge_path, result_html_prefix="Download DGE table: "))
tab_num+=1
| logFC | AveExpr | t | P.Value | adj.P.Val | B | |
|---|---|---|---|---|---|---|
| gene_symbol | ||||||
| SRGN | 2.14 | 9.83 | 105.66 | 3.328260e-20 | 6.334484e-16 | 36.70 |
| CHI3L1 | 3.15 | 9.74 | 101.23 | 5.753391e-20 | 6.334484e-16 | 36.07 |
| FTH1 | 2.63 | 10.86 | 76.01 | 2.238429e-18 | 1.504113e-14 | 32.71 |
| HSPA8 | 1.85 | 9.18 | 74.83 | 2.732266e-18 | 1.504113e-14 | 32.46 |
| FTL | 1.46 | 11.82 | 72.27 | 4.263324e-18 | 1.877568e-14 | 31.81 |
Table 2: This is a preview of the first 5 rows of the differentially expressed gene table calculated by Limma Voom.
| logFC | AveExpr | t | P.Value | adj.P.Val | B | |
|---|---|---|---|---|---|---|
| gene_symbol | ||||||
| HSPA8 | 1.35 | 8.98 | 32.01 | 3.204121e-13 | 7.055474e-09 | 20.70 |
| SCD | 1.21 | 8.72 | 26.74 | 2.851548e-12 | 3.139554e-08 | 18.65 |
| FADS2 | 1.32 | 7.59 | 18.41 | 2.516819e-10 | 2.792420e-07 | 14.26 |
| SQLE | 1.05 | 6.75 | 18.39 | 2.553503e-10 | 2.792420e-07 | 14.22 |
| INSIG1 | 1.36 | 6.41 | 18.11 | 3.059481e-10 | 2.929121e-07 | 14.02 |
Table 3: This is a preview of the first 5 rows of the differentially expressed gene table calculated by Limma Voom.
| logFC | AveExpr | t | P.Value | adj.P.Val | B | |
|---|---|---|---|---|---|---|
| gene_symbol | ||||||
| TFRC | 0.99 | 8.48 | 28.64 | 2.873655e-12 | 1.581947e-08 | 18.67 |
| TUBB4B | 0.93 | 7.84 | 24.34 | 1.898124e-11 | 4.179669e-08 | 16.81 |
| LPCAT1 | 0.88 | 7.85 | 24.04 | 2.193322e-11 | 4.390631e-08 | 16.67 |
| SFPQ | 0.62 | 9.47 | 20.80 | 1.164689e-10 | 1.349813e-07 | 15.02 |
| MT-CO1 | 0.41 | 13.45 | 20.34 | 1.509704e-10 | 1.662185e-07 | 14.07 |
Table 4: This is a preview of the first 5 rows of the differentially expressed gene table calculated by Limma Voom.
threshold=1.0
upreg = {}
downreg = {}
upreg_t500 = {}
downreg_t500 = {}
for sig_name in sig_names:
#dge = pd.read_csv(os.path.join(resource_path, sig_name), sep="\t").set_index("gene_symbol")
dge = dges[sig_name]
sig_name = sig_name.replace(".tsv", "")
up_genes = dge.loc[(dge['P.Value']<0.05) & (dge['logFC']>threshold), :].index.tolist()
down_genes = dge.loc[(dge['P.Value']<0.05) & (dge['logFC']<-threshold), :].index.tolist()
upreg[sig_name] = up_genes
downreg[sig_name] = down_genes
up_genes_t500 = dge.loc[(dge['P.Value']<0.05)].sort_values(by="logFC", ascending=False)[:500].index.tolist()
down_genes_t500 = dge.loc[(dge['P.Value']<0.05)].sort_values(by="logFC")[:500].index.tolist()
upreg_t500[sig_name] = up_genes_t500
downreg_t500[sig_name] = down_genes_t500
save_name = f"{sig_name}_volcano"
display(Markdown(f"**{sig_name}**"))
vis.plot_volcano(dge, threshold=threshold, save_formats=save_formats, save_name = save_name, save_html=save_html, save_path=resource_path)
#if save_html: display(HTML(os.path.join(resource_path, f"{save_name}.html")))
display(Image(os.path.join(resource_path, f"{save_name}.png"), width=700))
display(Markdown(f"**Figure {fig_num}**: The figure contains an interactive scatter plot which displays the log2-fold changes and statistical significance of each gene calculated by performing a differential gene expression analysis for the comparison {sig_name}. Every point in the plot represents a gene. Red points indicate significantly up-regulated genes, blue points indicate down-regulated genes."))
fig_num+=1
for fmt in save_formats:
file_name=os.path.join(resource_path, f"{save_name}.{fmt}")
display(FileLink(file_name, result_html_prefix=f"Download volcano plot as {fmt}: "))
dmso-vs-combo
Figure 6: The figure contains an interactive scatter plot which displays the log2-fold changes and statistical significance of each gene calculated by performing a differential gene expression analysis for the comparison dmso-vs-combo. Every point in the plot represents a gene. Red points indicate significantly up-regulated genes, blue points indicate down-regulated genes.
dmso-vs-ha
Figure 7: The figure contains an interactive scatter plot which displays the log2-fold changes and statistical significance of each gene calculated by performing a differential gene expression analysis for the comparison dmso-vs-ha. Every point in the plot represents a gene. Red points indicate significantly up-regulated genes, blue points indicate down-regulated genes.
dmso-vs-ven
Figure 8: The figure contains an interactive scatter plot which displays the log2-fold changes and statistical significance of each gene calculated by performing a differential gene expression analysis for the comparison dmso-vs-ven. Every point in the plot represents a gene. Red points indicate significantly up-regulated genes, blue points indicate down-regulated genes.
Enrichr: Enrichment Analysis¶
sig_names_clean = [name.replace('.tsv', '') for name in sig_names]
from python_scripts.enrichment import Enrichr_API, enrichr_figure
enrichr_libraries = ["ChEA_2022", "ARCHS4_TFs_Coexp", "Reactome_Pathways_2024", "MGI_Mammalian_Phenotype_Level_4_2024", "GO_Biological_Process_2025", "GWAS_Catalog_2023"]
if species == "human":
enrichr_libraries.extend(["WikiPathways_2024_Human", "KEGG_2021_Human"])
elif species == "mouse":
enrichr_libraries.extend(["WikiPathways_2024_Mouse", "KEGG_2019_Mouse"])
else:
raise Exception("Species not supported.")
enrichr_libraries.sort()
figure_file_format = save_formats
color = "tomato"
Upregulated Set¶
from IPython.display import Image
#upregulated results
for sig_name in sig_names_clean:
up_file_name = sig_name + ' up_enrichr_results'
final_output_file_names_up = [str(os.path.join(resource_path, up_file_name+'.'+file_type)) for file_type in figure_file_format]
uresults = Enrichr_API(upreg[sig_name], enrichr_libraries)
display(Markdown(f"#### **{sig_name}**"))
enrichr_figure(uresults[0],uresults[1],uresults[2],final_output_file_names_up, uresults[4],figure_file_format, color, show_plot=False)
display(Image(final_output_file_names_up[0], width=600)) #display the PNG
display(Markdown(f'**Figure {fig_num}**: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: ' + str('https://amp.pharm.mssm.edu/Enrichr/enrich?dataset='+ uresults[3])))
fig_num+=1
for name in final_output_file_names_up:
display(FileLink(name, result_html_prefix=f"Download figure as {name[name.rfind('.')+1:]}:"))
dmso-vs-combo¶
Figure 9: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=f46b595ce1c8abb5b5f47c6e078592d4
dmso-vs-ha¶
Figure 10: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=9e29b635bff0f028bf85ef8e41c65a83
dmso-vs-ven¶
Figure 11: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=372778e4f84d2a69f1b68d91c1f925a1
Downregulated Set¶
#downregulated results
for sig_name in sig_names_clean:
dn_file_name = sig_name + ' dn_enrichr_results'
final_output_file_names_dn = [str(os.path.join(resource_path, dn_file_name+'.'+file_type)) for file_type in figure_file_format]
dresults = Enrichr_API(downreg[sig_name], enrichr_libraries)
display(Markdown(f"#### **{sig_name}**"))
enrichr_figure(dresults[0],dresults[1],dresults[2],final_output_file_names_dn, dresults[4],figure_file_format, color, show_plot=False)
display(Image(final_output_file_names_dn[0], width=600)) #display the PNG
display(Markdown(f'**Figure {fig_num}**: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: ' + str('https://amp.pharm.mssm.edu/Enrichr/enrich?dataset='+ uresults[3])))
fig_num+=1
for name in final_output_file_names_dn:
display(FileLink(name, result_html_prefix=f"Download figure as {name[name.rfind('.')+1:]}:"))
dmso-vs-combo¶
Figure 12: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=372778e4f84d2a69f1b68d91c1f925a1
dmso-vs-ha¶
Figure 13: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=372778e4f84d2a69f1b68d91c1f925a1
dmso-vs-ven¶
Figure 14: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=372778e4f84d2a69f1b68d91c1f925a1
CHEA3: Transcription Factor Enrichment Analysis¶
Upregulated Set¶
import python_scripts.chea3 as chea
#TFs of upregulated genes
for sig_name in sig_names_clean:
save_name = sig_name + 'upchea'
up_results = chea.get_chea3_results(upreg[sig_name], 'query')
chea.mean_rank_bar(up_results, save_name=save_name, save_formats=save_formats, save_html=save_html, save_path=resource_path)
display(Markdown(f"#### **{sig_name}**"))
#if save_html: display(HTML(os.path.join(resource_path, f"{save_name}.html")))
display(Image(os.path.join(resource_path, f"{save_name}.png"), width=700))
display(Markdown(f"**Figure {fig_num}**: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries."))
fig_num+=1
for fmt in save_formats:
file_name=os.path.join(resource_path, f"{save_name}.{fmt}")
display(FileLink(file_name, result_html_prefix=f"Download bar plot as {fmt}: "))
dmso-vs-combo¶
Figure 15: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries.
dmso-vs-ha¶
Figure 16: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries.
dmso-vs-ven¶
Figure 17: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries.
Downregulated Set¶
for sig_name in sig_names_clean:
save_name = sig_name + 'dnchea'
dn_results = chea.get_chea3_results(downreg[sig_name], 'query')
chea.mean_rank_bar(dn_results, save_name=save_name, save_formats=save_formats, save_html=save_html, save_path=resource_path)
display(Markdown(f"#### **{sig_name}**"))
#if save_html: display(HTML(os.path.join(resource_path, f"{save_name}.html")))
display(Image(os.path.join(resource_path, f"{save_name}.png"), width=700))
display(Markdown(f"**Figure {fig_num}**: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries."))
fig_num+=1
for fmt in save_formats:
file_name=os.path.join(resource_path, f"{save_name}.{fmt}")
display(FileLink(file_name, result_html_prefix=f"Download bar plot as {fmt}: "))
dmso-vs-combo¶
Figure 18: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries.
dmso-vs-ha¶
Figure 19: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries.
dmso-vs-ven¶
Figure 20: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries.
L2S2 and DRUG-seqr: Reverser and Mimicker Drugs¶
Reverser Results¶
from python_scripts.druganalysis import druganalysis
dbs = ['l2s2_fda', 'l2s2_all', 'drugseqr_fda', 'drugseqr_all']
for sig_name in sig_names_clean:
# up_genes_t500 = upreg_t500[sig_name]
# down_genes_t500 = downreg_t500[sig_name]
up_genes = upreg[sig_name]
down_genes = upreg[sig_name]
rev_drugs = druganalysis(geneset=up_genes_t500, geneset_dn=down_genes_t500, direction="reversers", save_path=resource_path, save_name=sig_name, tab_num=tab_num, fig_num=fig_num)
display(Markdown(f"#### **{sig_name}**"))
for db in dbs:
display(Markdown(f"**{db}**"))
try:
rev_drugs.display_table(db=db)
rev_drugs.display_barplot(db=db, save_path=resource_path, save_formats=save_formats)
except ValueError as e:
print("Caught error: ", e)
fig_num = rev_drugs.fig_num
tab_num = rev_drugs.tab_num
display(Markdown("---"))
dmso-vs-combo¶
l2s2_fda
Caught error: No Results for l2s2_fda
l2s2_all
| perturbation | term | pvalueReverse | adjPvalueReverse | oddsRatioReverse | reverserOverlap | approved | count | |
|---|---|---|---|---|---|---|---|---|
| 0 | SL-0101-1 | AICHI002_K562_4H_D11_SL-0101-1_0.04uM up | 8.99e-01 | 1 | 7.01e-01 | 8 | False | 192 |
| 1 | GW-5074 | AICHI001_THP1_4H_N14_GW-5074_2.5uM up | 1.00e+00 | 1 | 0.00e+00 | 0 | False | 712 |
Table 5: Ranked LINCS L1000 signatures predicted to reverse the uploaded geneset.
Figure 21: barplot representation depicting the -log10p values of the top l2s2_all reversers. Red bars represent statistically significant results; otherwise gray.
drugseqr_fda
Caught error: No Results for drugseqr_fda
drugseqr_all
Caught error: No Results for drugseqr_all
dmso-vs-ha¶
l2s2_fda
Caught error: No Results for l2s2_fda
l2s2_all
| perturbation | term | pvalueReverse | adjPvalueReverse | oddsRatioReverse | reverserOverlap | approved | count | |
|---|---|---|---|---|---|---|---|---|
| 0 | SL-0101-1 | AICHI002_K562_4H_D11_SL-0101-1_0.04uM up | 8.99e-01 | 1 | 7.01e-01 | 8 | False | 192 |
| 1 | GW-5074 | AICHI001_THP1_4H_N14_GW-5074_2.5uM up | 1.00e+00 | 1 | 0.00e+00 | 0 | False | 712 |
Table 6: Ranked LINCS L1000 signatures predicted to reverse the uploaded geneset.
Figure 22: barplot representation depicting the -log10p values of the top l2s2_all reversers. Red bars represent statistically significant results; otherwise gray.
drugseqr_fda
Caught error: No Results for drugseqr_fda
drugseqr_all
Caught error: No Results for drugseqr_all
dmso-vs-ven¶
l2s2_fda
Caught error: No Results for l2s2_fda
l2s2_all
| perturbation | term | pvalueReverse | adjPvalueReverse | oddsRatioReverse | reverserOverlap | approved | count | |
|---|---|---|---|---|---|---|---|---|
| 0 | SL-0101-1 | AICHI002_K562_4H_D11_SL-0101-1_0.04uM up | 8.99e-01 | 1 | 7.01e-01 | 8 | False | 192 |
| 1 | GW-5074 | AICHI001_THP1_4H_N14_GW-5074_2.5uM up | 1.00e+00 | 1 | 0.00e+00 | 0 | False | 712 |
Table 7: Ranked LINCS L1000 signatures predicted to reverse the uploaded geneset.
Figure 23: barplot representation depicting the -log10p values of the top l2s2_all reversers. Red bars represent statistically significant results; otherwise gray.
drugseqr_fda
Caught error: No Results for drugseqr_fda
drugseqr_all
Caught error: No Results for drugseqr_all
Mimicker Results¶
for sig_name in sig_names_clean:
# up_genes_t500 = upreg_t500[sig_name]
# down_genes_t500 = downreg_t500[sig_name]
up_genes = upreg[sig_name]
down_genes = upreg[sig_name]
mim_drugs = druganalysis(geneset=up_genes_t500, geneset_dn=down_genes_t500, direction="mimickers", save_path=resource_path, save_name=sig_name, tab_num=tab_num, fig_num=fig_num)
display(Markdown(f"#### **{sig_name}**"))
for db in dbs:
display(Markdown(f"\n**{db}**"))
try:
mim_drugs.display_table(db=db)
mim_drugs.display_barplot(db=db, save_path=resource_path, save_formats=save_formats)
except ValueError as e:
print("Caught error: ", e)
fig_num = mim_drugs.fig_num
tab_num = mim_drugs.tab_num
display(Markdown("---"))
dmso-vs-combo¶
l2s2_fda
Caught error: No Results for l2s2_fda
l2s2_all
| perturbation | term | pvalueMimic | adjPvalueMimic | oddsRatioMimic | mimickerOverlap | approved | count | |
|---|---|---|---|---|---|---|---|---|
| 0 | SL-0101-1 | AICHI002_K562_4H_D11_SL-0101-1_0.04uM up | 3.77e-09 | 6.33e-03 | 3.24e+00 | 35 | False | 192 |
| 1 | GW-5074 | AICHI001_THP1_4H_N14_GW-5074_2.5uM up | 1.34e-08 | 1.13e-02 | 3.14e+00 | 34 | False | 712 |
Table 8: Ranked LINCS L1000 signatures predicted to mimic the uploaded geneset.
Figure 24: barplot representation depicting the -log10p values of the top l2s2_all mimickers. Red bars represent statistically significant results; otherwise gray.
drugseqr_fda
Caught error: No Results for drugseqr_fda
drugseqr_all
Caught error: No Results for drugseqr_all
dmso-vs-ha¶
l2s2_fda
Caught error: No Results for l2s2_fda
l2s2_all
| perturbation | term | pvalueMimic | adjPvalueMimic | oddsRatioMimic | mimickerOverlap | approved | count | |
|---|---|---|---|---|---|---|---|---|
| 0 | SL-0101-1 | AICHI002_K562_4H_D11_SL-0101-1_0.04uM up | 3.77e-09 | 6.33e-03 | 3.24e+00 | 35 | False | 192 |
| 1 | GW-5074 | AICHI001_THP1_4H_N14_GW-5074_2.5uM up | 1.34e-08 | 1.13e-02 | 3.14e+00 | 34 | False | 712 |
Table 9: Ranked LINCS L1000 signatures predicted to mimic the uploaded geneset.
Figure 25: barplot representation depicting the -log10p values of the top l2s2_all mimickers. Red bars represent statistically significant results; otherwise gray.
drugseqr_fda
Caught error: No Results for drugseqr_fda
drugseqr_all
Caught error: No Results for drugseqr_all
dmso-vs-ven¶
l2s2_fda
Caught error: No Results for l2s2_fda
l2s2_all
| perturbation | term | pvalueMimic | adjPvalueMimic | oddsRatioMimic | mimickerOverlap | approved | count | |
|---|---|---|---|---|---|---|---|---|
| 0 | SL-0101-1 | AICHI002_K562_4H_D11_SL-0101-1_0.04uM up | 3.77e-09 | 6.33e-03 | 3.24e+00 | 35 | False | 192 |
| 1 | GW-5074 | AICHI001_THP1_4H_N14_GW-5074_2.5uM up | 1.34e-08 | 1.13e-02 | 3.14e+00 | 34 | False | 712 |
Table 10: Ranked LINCS L1000 signatures predicted to mimic the uploaded geneset.
Figure 26: barplot representation depicting the -log10p values of the top l2s2_all mimickers. Red bars represent statistically significant results; otherwise gray.
drugseqr_fda
Caught error: No Results for drugseqr_fda
drugseqr_all
Caught error: No Results for drugseqr_all
References¶
references = f'''
[1] {citation}
[2] McInnes L, Healy J, Saul N, Großberger L. UMAP: Uniform manifold approximation and projection. Journal of Open Source Software. 2018;3(29):861. doi:10.21105/joss.00861
[3] Clark NR, Ma’ayan A. Introduction to statistical methods to analyze large data sets: Principal Components Analysis. Science Signaling. 2011;4(190):tr3-tr3. doi:10.1126/scisignal.2001967
[4] van der Maaten L, Hinton G. Visualizing Data using t-SNE. Journal of Machine Learning Research. 2008;9(86):2579-2605.
[5] Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013;128(14)
[6] Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377.
[7] Xie Z, Bailey A, Kuleshov MV, Clarke DJB., Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM, Jeon M, & Ma’ayan A. Gene set knowledge discovery with Enrichr. Current Protocols, 1, e90. 2021. doi: 10.1002/cpz1.90
[8] Keenan AB, Torre D, Lachmann A, Leong AK, Wojciechowicz M, Utti V, Jagodnik K, Kropiwnicki E, Wang Z, Ma'ayan A (2019) ChEA3: transcription factor enrichment analysis by orthogonal omics integration. Nucleic Acids Research. doi: 10.1093/nar/gkz446
[9] Marino GB, Evangelista JE, Clarke DJB, Ma’ayan A. L2S2: chemical perturbation and CRISPR KO LINCS L1000 signature search engine. Nucleic Acids Res. 2025; gkaf373. doi:10.1093/nar/gkaf373
[10] Li J, Ho DJ, Henault M, et al. DRUG-seq Provides Unbiased Biological Activity Readouts for Neuroscience Drug Discovery. ACS Chem Biol. 2022;17(6):1401-1414. doi:10.1021/acschembio.1c00920
[11] Lachmann A, Torre D, Keenan AB, Jagodnik KM, Lee HJ, Wang L, Silverstein MC, Ma'ayan A. Massive mining of publicly available RNA-seq data from human and mouse. Nature Communications 9. Article number: 1366 (2018), doi: 10.1038/s41467-018-03751-6.
[12] Bray, N., Pimentel, H., Melsted, P. et al. Near-optimal probabilistic RNA-seq quantification. Nat Biotechnol 34, 525–527 (2016). https://doi.org/10.1038/nbt.3519
[13] Fernandez, N. F. et al. Clustergrammer, a web-based heatmap visualization and analysis tool for high-dimensional biological data. Sci. Data 4:170151 doi: 10.1038/sdata.2017.151 (2017).
[14] Ritchie ME, Phipson B, Wu D, Hu Y, Law CW, Shi W, Smyth GK. limma powers differential expression analyses for RNA-sequencing and microarray studies. Nucleic Acids Res. 2015 Apr 20;43(7):e47. doi: 10.1093/nar/gkv007.
[15] Milacic M, Beavers D, Conley P, Gong C, Gillespie M, Griss J, Haw R, Jassal B, Matthews L, May B, Petryszak R, Ragueneau E, Rothfels K, Sevilla C, Shamovsky V, Stephan R, Tiwari K, Varusai T, Weiser J, Wright A, Wu G, Stein L, Hermjakob H, D’Eustachio P. The Reactome Pathway Knowledgebase 2024. Nucleic Acids Research. 2024. doi: 10.1093/nar/gkad1025.
[16] Eppig JT, Smith CL, Blake JA, Ringwald M, Kadin JA, Richardson JE, Bult CJ. Mouse Genome Informatics (MGI): Resources for Mining Mouse Genetic, Genomic, and Biological Data in Support of Primary and Translational Research. Methods Mol Biol. 2017;1488:47-73. doi: 10.1007/978-1-4939-6427-7_3.
[17] Ashburner M, Ball CA, Blake JA, Botstein D, Butler H, Cherry JM, Davis AP, Dolinski K, Dwight SS, Eppig JT, Harris MA, Hill DP, Issel-Tarver L, Kasarskis A, Lewis S, Matese JC, Richardson JE, Ringwald M, Rubin GM, Sherlock G. Gene ontology: tool for the unification of biology. The Gene Ontology Consortium. Nat Genet. 2000 May;25(1):25-9. doi: 10.1038/75556.
[18] Cerezo M, Sollis E, Ji Y, et al. The NHGRI-EBI GWAS Catalog: standards for reusability, sustainability and diversity. Nucleic Acids Res. 2025;53(D1):D998-D1005. doi:10.1093/nar/gkae1070
[19] Kanehisa M, Furumichi M, Sato Y, Matsuura Y, Ishiguro-Watanabe M. KEGG: biological systems database as a model of the real world. Nucleic Acids Res. 2025;53(D1):D672-D677. doi:10.1093/nar/gkae909
[20] Kanehisa M, Goto S. KEGG: kyoto encyclopedia of genes and genomes. Nucleic Acids Res. 2000;28(1):27-30. doi:10.1093/nar/28.1.27
[21] Kanehisa M. Toward understanding the origin and evolution of cellular organisms. Protein Sci. 2019;28(11):1947-1951. doi:10.1002/pro.3715
[22] Pico AR, Kelder T, van Iersel MP, Hanspers K, Conklin BR, Evelo C. WikiPathways: pathway editing for the people. PLoS Biol. 2008 Jul 22;6(7):e184. doi: 10.1371/journal.pbio.0060184.
[23] GTEx Consortium. The Genotype-Tissue Expression (GTEx) project. Nat Genet. 2013 Jun;45(6):580-5. doi: 10.1038/ng.2653.
[24] ENCODE Project Consortium. An integrated encyclopedia of DNA elements in the human genome. Nature. 2012;489(7414):57-74. doi:10.1038/nature11247
[25] Luo Y, Hitz BC, Gabdank I, et al. New developments on the Encyclopedia of DNA Elements (ENCODE) data portal. Nucleic Acids Res. 2020;48(D1):D882-D889. doi:10.1093/nar/gkz1062
[26] Hammal F, de Langen P, Bergon A, Lopez F, Ballester B. ReMap 2022: a database of Human, Mouse, Drosophila and Arabidopsis regulatory regions from an integrative analysis of DNA-binding sequencing experiments. Nucleic Acids Res. 2022;50(D1):D316-D325. doi:10.1093/nar/gkab996
'''
display(Markdown(references))
[1] Jiang, X., Huang, K., Sun, X., Li, Y., Hua, L., Liu, F., Huang, R., Du, J., & Zeng, H. (2024). Hexamethylene amiloride synergizes with venetoclax to induce lysosome-dependent cell death in acute myeloid leukemia.. iScience, 27(1), 108691.
[2] McInnes L, Healy J, Saul N, Großberger L. UMAP: Uniform manifold approximation and projection. Journal of Open Source Software. 2018;3(29):861. doi:10.21105/joss.00861
[3] Clark NR, Ma’ayan A. Introduction to statistical methods to analyze large data sets: Principal Components Analysis. Science Signaling. 2011;4(190):tr3-tr3. doi:10.1126/scisignal.2001967
[4] van der Maaten L, Hinton G. Visualizing Data using t-SNE. Journal of Machine Learning Research. 2008;9(86):2579-2605.
[5] Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013;128(14)
[6] Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377.
[7] Xie Z, Bailey A, Kuleshov MV, Clarke DJB., Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM, Jeon M, & Ma’ayan A. Gene set knowledge discovery with Enrichr. Current Protocols, 1, e90. 2021. doi: 10.1002/cpz1.90
[8] Keenan AB, Torre D, Lachmann A, Leong AK, Wojciechowicz M, Utti V, Jagodnik K, Kropiwnicki E, Wang Z, Ma'ayan A (2019) ChEA3: transcription factor enrichment analysis by orthogonal omics integration. Nucleic Acids Research. doi: 10.1093/nar/gkz446
[9] Marino GB, Evangelista JE, Clarke DJB, Ma’ayan A. L2S2: chemical perturbation and CRISPR KO LINCS L1000 signature search engine. Nucleic Acids Res. 2025; gkaf373. doi:10.1093/nar/gkaf373
[10] Li J, Ho DJ, Henault M, et al. DRUG-seq Provides Unbiased Biological Activity Readouts for Neuroscience Drug Discovery. ACS Chem Biol. 2022;17(6):1401-1414. doi:10.1021/acschembio.1c00920
[11] Lachmann A, Torre D, Keenan AB, Jagodnik KM, Lee HJ, Wang L, Silverstein MC, Ma'ayan A. Massive mining of publicly available RNA-seq data from human and mouse. Nature Communications 9. Article number: 1366 (2018), doi: 10.1038/s41467-018-03751-6.
[12] Bray, N., Pimentel, H., Melsted, P. et al. Near-optimal probabilistic RNA-seq quantification. Nat Biotechnol 34, 525–527 (2016). https://doi.org/10.1038/nbt.3519
[13] Fernandez, N. F. et al. Clustergrammer, a web-based heatmap visualization and analysis tool for high-dimensional biological data. Sci. Data 4:170151 doi: 10.1038/sdata.2017.151 (2017).
[14] Ritchie ME, Phipson B, Wu D, Hu Y, Law CW, Shi W, Smyth GK. limma powers differential expression analyses for RNA-sequencing and microarray studies. Nucleic Acids Res. 2015 Apr 20;43(7):e47. doi: 10.1093/nar/gkv007.
[15] Milacic M, Beavers D, Conley P, Gong C, Gillespie M, Griss J, Haw R, Jassal B, Matthews L, May B, Petryszak R, Ragueneau E, Rothfels K, Sevilla C, Shamovsky V, Stephan R, Tiwari K, Varusai T, Weiser J, Wright A, Wu G, Stein L, Hermjakob H, D’Eustachio P. The Reactome Pathway Knowledgebase 2024. Nucleic Acids Research. 2024. doi: 10.1093/nar/gkad1025.
[16] Eppig JT, Smith CL, Blake JA, Ringwald M, Kadin JA, Richardson JE, Bult CJ. Mouse Genome Informatics (MGI): Resources for Mining Mouse Genetic, Genomic, and Biological Data in Support of Primary and Translational Research. Methods Mol Biol. 2017;1488:47-73. doi: 10.1007/978-1-4939-6427-7_3.
[17] Ashburner M, Ball CA, Blake JA, Botstein D, Butler H, Cherry JM, Davis AP, Dolinski K, Dwight SS, Eppig JT, Harris MA, Hill DP, Issel-Tarver L, Kasarskis A, Lewis S, Matese JC, Richardson JE, Ringwald M, Rubin GM, Sherlock G. Gene ontology: tool for the unification of biology. The Gene Ontology Consortium. Nat Genet. 2000 May;25(1):25-9. doi: 10.1038/75556.
[18] Cerezo M, Sollis E, Ji Y, et al. The NHGRI-EBI GWAS Catalog: standards for reusability, sustainability and diversity. Nucleic Acids Res. 2025;53(D1):D998-D1005. doi:10.1093/nar/gkae1070
[19] Kanehisa M, Furumichi M, Sato Y, Matsuura Y, Ishiguro-Watanabe M. KEGG: biological systems database as a model of the real world. Nucleic Acids Res. 2025;53(D1):D672-D677. doi:10.1093/nar/gkae909
[20] Kanehisa M, Goto S. KEGG: kyoto encyclopedia of genes and genomes. Nucleic Acids Res. 2000;28(1):27-30. doi:10.1093/nar/28.1.27
[21] Kanehisa M. Toward understanding the origin and evolution of cellular organisms. Protein Sci. 2019;28(11):1947-1951. doi:10.1002/pro.3715
[22] Pico AR, Kelder T, van Iersel MP, Hanspers K, Conklin BR, Evelo C. WikiPathways: pathway editing for the people. PLoS Biol. 2008 Jul 22;6(7):e184. doi: 10.1371/journal.pbio.0060184.
[23] GTEx Consortium. The Genotype-Tissue Expression (GTEx) project. Nat Genet. 2013 Jun;45(6):580-5. doi: 10.1038/ng.2653.
[24] ENCODE Project Consortium. An integrated encyclopedia of DNA elements in the human genome. Nature. 2012;489(7414):57-74. doi:10.1038/nature11247
[25] Luo Y, Hitz BC, Gabdank I, et al. New developments on the Encyclopedia of DNA Elements (ENCODE) data portal. Nucleic Acids Res. 2020;48(D1):D882-D889. doi:10.1093/nar/gkz1062
[26] Hammal F, de Langen P, Bergon A, Lopez F, Ballester B. ReMap 2022: a database of Human, Mouse, Drosophila and Arabidopsis regulatory regions from an integrative analysis of DNA-binding sequencing experiments. Nucleic Acids Res. 2022;50(D1):D316-D325. doi:10.1093/nar/gkab996
# %%capture
# !jupyter nbconvert --to html --no-input "{gse}.ipynb"